In [ ]:
from __future__ import print_function, unicode_literals
import gzip
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import dbpedia_config
# Use seaborn's 'poster' plotting context, with font sizes scaled to 0.8.
sns.set_context('poster', font_scale=0.8)
# IPython magic: show matplotlib figures inline in the notebook output.
%matplotlib inline
In [ ]:
# Folder used below for the per-language input files and the consolidated
# output, plus the language editions to process; both come from dbpedia_config.
target_folder, languages = dbpedia_config.TARGET_FOLDER, dbpedia_config.LANGUAGES
In [ ]:
# One gzipped person-data CSV path per language, in the same order as `languages`.
filenames = [
    '{1}/person_data_{0}.csv.gz'.format(lang, target_folder)
    for lang in languages
]
In [ ]:
# Load every language edition's person data and stack them into one frame.
# Each edition is tagged with its language code so that rows stay attributable
# to their source after the merge.
edition_frames = []
for lang, filename in zip(languages, filenames):
    this_edition = pd.read_csv(filename, encoding='utf-8')
    this_edition['language'] = lang
    edition_frames.append(this_edition)

# Concatenate once: calling pd.concat inside the loop re-copies the whole
# accumulated frame on every iteration. Row labels of the individual editions
# are kept as they are (no ignore_index), exactly as the incremental version did.
all_bios = pd.concat(edition_frames)

# Number of rows contributed by each language edition.
all_bios.language.value_counts()
In [ ]:
def w_fraction(arr):
    """Return the fraction of entries in *arr* that equal ``'female'``.

    The denominator is the full length of *arr*, so entries holding any
    other value (including missing ones) still count towards the total.
    """
    female_total = np.sum(arr == 'female')
    entry_total = float(len(arr))
    return female_total / entry_total
# Per-language summary table, ordered from the largest edition to the smallest:
#   edition_count     -> number of rows for the language
#   gender            -> fraction of rows whose gender is 'female' (w_fraction)
#   available_english -> mean of the available_english column
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
col_labels = all_bios.groupby('language').aggregate(
    {'edition_count': lambda x: len(x), 'gender': w_fraction, 'available_english': np.mean}
).sort_values('edition_count', ascending=False)

# Median and mean of edition_count per language, split by gender. One grouped
# pass per (statistic, gender) pair replaces a boolean scan of the whole frame
# for every language. Assigning the resulting Series aligns it on the language
# index of col_labels; a language with no rows for a gender ends up as NaN,
# which is also what the median/mean of an empty selection yields.
for stat in ('median', 'mean'):
    for gender in ('female', 'male'):
        per_language = (all_bios[all_bios.gender == gender]
                        .groupby('language').edition_count.agg(stat))
        col_labels['{0}_{1}_count'.format(gender, stat)] = per_language
col_labels
In [ ]:
# De-duplicate in place, one identifying column at a time and in this order;
# for each column the first row carrying a given value is the one that is kept.
# NOTE(review): check how rows with a missing value in one of these columns
# are treated by drop_duplicates - confirm they are not collapsed unintentionally.
for key_column in ('same_as', 'wikidata_entity', 'label'):
    all_bios.drop_duplicates(subset=[key_column], inplace=True)
In [ ]:
# Tally how many rows carry each gender value.
all_bios['gender'].value_counts()
In [ ]:
# Keep only rows whose gender is 'male' or 'female'; .copy() makes the result
# an independent frame rather than a selection of the previous one.
all_bios = all_bios.loc[all_bios['gender'].isin(['male', 'female'])].copy()
In [ ]:
# Print the (rows, columns) shape of the current frame.
print(all_bios.shape)
In [ ]:
# Tally the rows per gender value in the current frame.
all_bios['gender'].value_counts()
In [ ]:
# Show five randomly chosen rows as a quick sanity check.
all_bios.sample(n=5)
In [ ]:
# Write the consolidated frame (index included) as a gzip-compressed UTF-8 CSV.
# The path is handed to pandas together with compression='gzip' instead of a
# handle from gzip.open(..., 'wb'): on Python 3, older pandas versions write
# text to the handle they are given, which fails on a binary gzip stream.
consolidated_path = '{0}/consolidated_person_data.csv.gz'.format(target_folder)
all_bios.to_csv(consolidated_path, encoding='utf-8', compression='gzip')
In [ ]: